PART ONE : Exploration

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(ggridges)
df<- readr::read_csv("cs_1675_fall2021_finalproject.csv", col_names = TRUE)
## Rows: 1252 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): m
## dbl (10): x1, x2, x3, x4, v1, v2, v3, v4, v5, output
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- df %>% 
      mutate(outcome = ifelse(output < 0.33, 'event', 'non_event'),
      outcome = factor(outcome, levels = c("event", "non_event"))) %>%
      mutate(y = boot::logit(output)) %>% glimpse()
## Rows: 1,252
## Columns: 13
## $ x1      <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.…
## $ x2      <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.…
## $ x3      <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.…
## $ x4      <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.…
## $ v1      <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.…
## $ v2      <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.…
## $ v3      <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.…
## $ v4      <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.…
## $ v5      <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.…
## $ m       <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ output  <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.986, 0.874,…
## $ outcome <fct> non_event, non_event, non_event, event, non_event, non_event, …
## $ y       <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201407, 3.0320…
derived_df <- df %>% 
              mutate(x5 = 1 - (x1 + x2 + x3 + x4),
                     w = x2 / (x3 + x4),
                     z = (x1 + x2) / (x5 + x4),
                     t = v1 * v2)

Data to be explored:

Base features:
> Chemistry variables: x1:x4
> Manufacturing process variables: v1:v5, m
Derived features: x5, w, z, and t
> x5 = 1 - (x1 + x2 + x3 + x4)
> w = x2 / (x3 + x4)
> z = (x1 + x2) / (x4 + x5)
> t=v1v2
Output: output
*
Logit-transformed response: y = boot::logit(output)

Check datatypes per feature:

visdat::vis_dat(df) #checking dataypes per feature

Check the distinct values per base feature:

df %>% subset(select = -output)%>% purrr::map_dbl(n_distinct) #distinct values per feature
##      x1      x2      x3      x4      v1      v2      v3      v4      v5       m 
##    1245    1250    1250    1235    1252    1249    1252    1252    1252       5 
## outcome       y 
##       2     690

All base features have roughly the same number of observations, with the exception of m. The machine (m) input has relatively few unique values. Visually checking the counts associated with each unique value for m:

#df %>% count(m) #counts
df%>%
  subset(select = -output)%>% 
  ggplot(mapping=aes(m)) +
  geom_bar() +
  theme_bw()

Visualizing the distributions of chemistry and manufacturing process features:

df %>% 
  select(starts_with('x'), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(starts_with('x'))%>%
  ggplot(mapping = aes(x= value)) +
  geom_histogram(bins = 21) +
  facet_wrap(~name, scales = 'free_x')

df %>% 
  select(starts_with('v'), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(starts_with('v'))%>%
  ggplot(mapping = aes(x= value)) +
  geom_histogram(bins = 21) +
  facet_wrap(~name, scales = 'free_x')

Visualizing the distributions of derived features:

derived_df %>% 
  select(c(x5,t,w,z)) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(c(x5,t,w,z))%>%
  ggplot(mapping = aes(x= value)) +
  geom_histogram(bins = 21) +
  facet_wrap(~name, scales = 'free_x')

### Distribution of the responses:

derived_df %>% 
  select(c(y,output)) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(c(y,output))%>%
  ggplot(mapping = aes(x= value)) +
  geom_histogram(bins = 21) +
  facet_wrap(~name, scales = 'free_x')

The logit transformed response pushes the output to resemble a normal distribution.

Visualizing expanded features per machine:

derived_df %>% 
  select(c(x5,t,w,z), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(c(x5,t,w,z))%>%
  ggplot(mapping = aes(x= value)) + 
  geom_freqpoly(mapping = aes(color = m, y = stat(density))) + 
  facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualizing output features per machine:

derived_df %>% 
  select(c(output,y), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(c(output,y))%>%
  ggplot(mapping = aes(x= value)) + 
  geom_freqpoly(mapping = aes(color = m, y = stat(density))) + 
  facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

There seems to be some variation on the output extremes (approaching 1.0) per machine. However, the logic transformed response, y, shows no significant variation.

Visualizing chemistry & manufacturing process input per by machine:

df %>% 
  select(starts_with('x'), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(starts_with('x'))%>%
  ggplot(mapping = aes(x= value)) + 
  geom_freqpoly(mapping = aes(color = m, y = stat(density))) + 
  facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df %>% 
  select(starts_with('v'), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(starts_with('v'))%>%
  ggplot(mapping = aes(x= value)) + 
  geom_freqpoly(mapping = aes(color = m, y = stat(density))) + 
  facet_wrap(~name, scales = 'free')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

No feature varies significantly according to ‘m’.

Visualizing the summary statistics of base chemistry and manufacturing features:

df %>% 
  select(starts_with('x'), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(starts_with('x'))%>%
  ggplot(mapping = aes(x= m)) + 
  geom_boxplot(mapping = aes( y = value))+
  stat_summary(fun.data = 'mean_se', mapping = aes(y = value), fun.args = list(mult = 2))+
  facet_wrap(~name, scales = 'free')

The distributions of each chemistry & manufacturing process feature appear roughly uniform across machines; machine doesn’t appear to impact any single variable in particular.

Visualizing the summary statistics of the expanded feature set:

derived_df %>% 
  select(c(x5,t,w,z), m) %>%
  tibble::rowid_to_column() %>%
  pivot_longer(c(x5,t,w,z))%>%
  ggplot(mapping = aes(x= m)) + 
  geom_boxplot(mapping = aes( y = value))+
  stat_summary(fun.data = 'mean_se', mapping = aes(y = value), fun.args = list(mult = 2))+
  facet_wrap(~name, scales = 'free')

Features ‘t’ and ‘x5’ appear to contain high value outliers. However, machine still does not appear to affect any derived feature.

Visualizing correlation between inputs

Correlation between base & derived features:

derived_df %>%
  select(-m, -output, -outcome) %>%
  cor() %>%
  corrplot::corrplot(method = 'number',type = 'upper')

No input has significant correlation (>0.85). However, features x5 and z seem to have a moderately negative correlation with respect to each other.

Visualizing the relationship between the logit-transformed response, y, and output with respect to inputs:

Base Features:

df %>% 
  select(starts_with('x'), -x4, y) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "y")) %>% 
  ggplot(mapping = aes(x = value, y = y)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(starts_with('x'), -x4, output) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "output")) %>% 
  ggplot(mapping = aes(x = value, y = output)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(x4, y) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "y")) %>% 
  ggplot(mapping = aes(x = value, y = y)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(x4, output) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "output")) %>% 
  ggplot(mapping = aes(x = value, y = output)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(v1,v3,v5, y) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "y")) %>% 
  ggplot(mapping = aes(x = value, y = y)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(v2,v4, y) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "y")) %>% 
  ggplot(mapping = aes(x = value, y = y)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(v1,v3,v5, output) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "output")) %>% 
  ggplot(mapping = aes(x = value, y = output)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

df %>% 
  select(v2,v4, output) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "output")) %>% 
  ggplot(mapping = aes(x = value, y = output)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

There seems to be little to no trend with respect to ‘y’ and inputs ‘v’ and ‘x4’. There seem to be some relationship between certain values of ‘x1’, ‘x2’, and ‘x3’ that result in a minimized ‘y’.

Derived Features:

derived_df %>% 
  select(t,z, y) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "y")) %>% 
  ggplot(mapping = aes(x = value, y = y)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

derived_df %>% 
  select(t,z, output) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "output")) %>% 
  ggplot(mapping = aes(x = value, y = output)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

derived_df %>% 
  select(w,x5, y) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "y")) %>% 
  ggplot(mapping = aes(x = value, y = y)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

derived_df %>% 
  select(w,x5, output) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "output")) %>% 
  ggplot(mapping = aes(x = value, y = output)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~name) +
  theme_bw()

It does nto appear that there is a relationship between values of ‘t’ that result in a minimized ‘y’. There may be a relationship between certain values of ‘w’, ‘x5’, and ‘z’ that result in a minimized ‘y’.

Visualizing the behavior between the derived binary outcome and the inputs:

derived_df %>% 
  ggplot(mapping = aes(x = outcome)) +
  geom_bar() +
  geom_text(stat = 'count',
            mapping = aes(label = stat(count)),
            color = 'red',
            nudge_y = 7,
            size = 5.5) +
  theme_bw()

Although class “event” occurs at a rate of ~50% of class “non event”, the imbalance is not substantial; subsampling should not be required.

derived_df %>% 
  select(starts_with('x'),outcome) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "outcome")) %>% 
  mutate(input_id = as.integer( stringr::str_extract(name, "\\d+") )) %>% 
  mutate(input_bin = cut(input_id,
                         breaks = quantile(input_id),
                         include.lowest = TRUE)) %>% 
  filter(input_id < 11) %>% 
  ggplot(mapping = aes(x = value, y = as.factor(input_id))) +
  geom_density_ridges(mapping = aes(fill = outcome),
                      alpha = 0.5) +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(strip.text = element_blank())
## Picking joint bandwidth of 0.0262

derived_df %>% 
  select(starts_with('v'),outcome) %>%
  tibble::rowid_to_column() %>% 
  pivot_longer(!c("rowid", "outcome")) %>% 
  mutate(input_id = as.integer( stringr::str_extract(name, "\\d+") )) %>% 
  mutate(input_bin = cut(input_id,
                         breaks = quantile(input_id),
                         include.lowest = TRUE)) %>% 
  filter(input_id < 11) %>% 
  ggplot(mapping = aes(x = value, y = as.factor(input_id))) +
  geom_density_ridges(mapping = aes(fill = outcome),
                      alpha = 0.5) +
  #facet_wrap(~input_bin, scales = "free_y") +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(strip.text = element_blank())
## Picking joint bandwidth of 0.446